datadir="/rds/general/user/la420/home/CUTnTAG/antibodies/bowtie2_summary/"
sampleList = c("ActiveMotif", "Diagenode_100x", "Diagenode_50x", "Abcam-ab177178", "Abcam-ab4729", "H3K27me3")
## collect the alignment results from the bowtie2 alignment summary files
alignResult = c()
for(sample in sampleList){
alignRes = read.table(paste0(datadir, sample, "_trimmed_bowtie2.txt"), header = FALSE, fill = TRUE)
alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
alignResult = data.frame(Antibody = sample, SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric,
MappedFragNum_hg19 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric,
AlignmentRate_hg19 = alignRate %>% as.numeric) %>% rbind(alignResult, .)
}
alignResult %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))## summarize the duplication information from the picard summary outputs
picarddir = "/rds/general/user/la420/home/CUTnTAG/antibodies/picard_summary/"
dupResult = c()
for(sample in sampleList){
dupRes = read.table(paste0(picarddir, sample, "_picard.rmDup.txt"), header = TRUE, fill = TRUE)
dupResult = data.frame(Antibody = sample, MappedFragNum_hg19 = dupRes$READ_PAIRS_EXAMINED[1] %>% as.character %>% as.numeric, DuplicationRate = dupRes$PERCENT_DUPLICATION[1] %>% as.character %>% as.numeric * 100) %>% mutate(UniqueFragNum = (MappedFragNum_hg19 * (1-DuplicationRate/100)) %>% round()) %>% rbind(dupResult, .)
}
alignDupSummary = left_join(alignResult, dupResult, by = c("Antibody", "MappedFragNum_hg19")) %>% mutate(DuplicationRate = paste0(DuplicationRate, "%"))
alignDupSummary## collect the fragment size information
fragdir = "/rds/general/user/la420/home/CUTnTAG/antibodies/fragmentLen/"
sampleList = c("ActiveMotif", "Diagenode_100x", "Diagenode_50x", "Abcam-ab177178", "Abcam-ab4729", "H3K27me3")
fragLen = c()
for(sample in sampleList){
fragLen = read.table(paste0(fragdir, sample, "_rmDup_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Antibody = sample) %>% rbind(fragLen, .)
}
fig5A = fragLen %>% ggplot(aes(x = Antibody, y = fragLen, weight = Weight, fill = Antibody)) +
geom_violin(bw = 5) +
scale_y_continuous(breaks = seq(0, 800, 50)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 20) +
ggpubr::rotate_x_text(angle = 20) +
ylab("Fragment Length") +
xlab("")
fig5B = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Antibody, group = Antibody)) +
geom_line(size = 1) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma") +
theme_bw(base_size = 20) +
xlab("Fragment Length") +
ylab("Count") +
coord_cartesian(xlim = c(0, 500))
ggarrange(fig5A, fig5B, ncol = 2)fragdir = "/rds/general/user/la420/home/CUTnTAG/antibodies_withDup/fragmentLen/"
fragLen = c()
for(sample in sampleList){
fragLen = read.table(paste0(fragdir, sample, "_withDup_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Antibody = sample) %>% rbind(fragLen, .)
}
fig5A = fragLen %>% ggplot(aes(x = Antibody, y = fragLen, weight = Weight, fill = Antibody)) +
geom_violin(bw = 5) +
scale_y_continuous(breaks = seq(0, 800, 50)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "viridis", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 20) +
ggpubr::rotate_x_text(angle = 20) +
ylab("Fragment Length") +
xlab("")
fig5B = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Antibody, group = Antibody)) +
geom_line(size = 1) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "viridis") +
theme_bw(base_size = 20) +
xlab("Fragment Length") +
ylab("Count") +
coord_cartesian(xlim = c(0, 500))
ggarrange(fig5A, fig5B, ncol = 2)peakdir = "/rds/general/user/la420/home/CUTnTAG/antibodies/all_peaks/"
sampleList_all = c("ActiveMotif_SEACR", "ActiveMotif_MACS2", "Diagenode_100x_SEACR", "Diagenode_100x_MACS2", "Diagenode_50x_SEACR", "Diagenode_50x_MACS2", "Abcam-ab177178_SEACR", "Abcam-ab177178_MACS2", "Abcam-ab4729_SEACR", "Abcam-ab4729_MACS2", "H3K27me3_SEACR", "H3K27me3_MACS2")
peakList = list()
for(sample in sampleList_all){
path = paste0(peakdir, sample, ".bed")
peak = ChIPseeker::readPeakFile(file.path(path), as = "GRanges") %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
peakList = c(peakList, peak)
}
peakList = setNames(peakList, sampleList_all)Total_CT_peaks = c()
for(peak in peakList){
peakN = length(peak)
Total_CT_peaks = c(Total_CT_peaks, peakN)
}
Total_CT_peaks = data.frame(Antibody = sampleList_all, Total_PeakN = Total_CT_peaks)
Total_CT_peakspeakWidth = c()
for(peak in peakList){
peakW = GenomicRanges::width(peak) %>% summary() %>% round() %>% list()
peakWidth = c(peakWidth, peakW)
}
WSummary = c()
for(i in 1:length(peakWidth)){
WSummary = data.frame(Antibody = sampleList_all[i], MinW = array(peakWidth[[i]][1]), MedianW = array(peakWidth[[i]][3]), MeanW = array(peakWidth[[i]][4]), MaxW = array(peakWidth[[i]][6])) %>% rbind(., WSummary)
}
WSummaryWidthSummary = data.frame(Antibody = sampleList, MACS2_MedianW = WSummary$MedianW[c(1, 3, 5, 7, 9, 11)], MACS2_MeanW = WSummary$MeanW[c(1, 3, 5, 7, 9, 11)], SEACR_MedianW = WSummary$MedianW[c(2, 4, 6, 8, 10, 12)], SEACR_MeanW = WSummary$MeanW[c(2, 4, 6, 8, 10, 12)])
SeqWidthSummary = left_join(alignDupSummary, WidthSummary, by = "Antibody")
SeqWidthSummary = SeqWidthSummary[c("Antibody", "SequencingDepth", "MACS2_MedianW", "MACS2_MeanW", "SEACR_MedianW", "SEACR_MeanW")]
SeqWidthSummaryMACS2_peaks = data.frame(Antibody = sampleList, PeakN_MACS2 = Total_CT_peaks$Total_PeakN[c(2, 4, 6, 8, 10, 12)])
SEACR_peaks = data.frame(Antibody = sampleList, PeakN_SEACR = Total_CT_peaks$Total_PeakN[c(1, 3, 5, 7, 9, 11)])
PeaksAlignmentSummary = left_join(alignDupSummary, SEACR_peaks, by = "Antibody") %>% left_join(MACS2_peaks, by = "Antibody")
PeaksAlignmentSummary = PeaksAlignmentSummary[c("Antibody", "SequencingDepth", "MappedFragNum_hg19", "UniqueFragNum", "DuplicationRate", "PeakN_SEACR", "PeakN_MACS2")]
PeaksAlignmentSummaryggplot(PeaksAlignmentSummary, aes(x = UniqueFragNum, y = PeakN_SEACR)) + geom_point(aes(size = DuplicationRate)) + geom_text(aes(label = Antibody), nudge_y = 500, hjust = "inward")
ggplot(PeaksAlignmentSummary, aes(x = SequencingDepth, y = PeakN_SEACR)) + geom_point(aes(size = DuplicationRate)) + geom_text(aes(label = Antibody), nudge_y = 500, hjust = "inward")
ggplot(SeqWidthSummary, aes(x = SequencingDepth, y = SEACR_MedianW)) + geom_point() + geom_text(aes(label = Antibody), nudge_y = 100, hjust = "inward")ggplot(PeaksAlignmentSummary, aes(x = UniqueFragNum, y = PeakN_MACS2)) + geom_point(aes(size = DuplicationRate)) + geom_text(aes(label = Antibody), nudge_y = 500, hjust = "inward")
ggplot(PeaksAlignmentSummary, aes(x = SequencingDepth, y = PeakN_MACS2)) + geom_point(aes(size = DuplicationRate)) + geom_text(aes(label = Antibody), nudge_y = 500, hjust = "inward")
ggplot(SeqWidthSummary, aes(x = SequencingDepth, y = MACS2_MedianW)) + geom_point() + geom_text(aes(label = Antibody), nudge_y = 10, hjust = "inward")extraCols_narrowPeak = c(signalValue = "numeric", pValue = "numeric", qValue = "numeric", peak = "integer")
ENCODE_H3K27ac = rtracklayer::import("https://www.encodeproject.org/files/ENCFF044JNJ/@@download/ENCFF044JNJ.bed.gz", format = "BED", extraCols = extraCols_narrowPeak)
ENCODE_H3K27me3 = rtracklayer::import("https://www.encodeproject.org/files/ENCFF126QYP/@@download/ENCFF126QYP.bed.gz", format = "BED", extraCols = extraCols_narrowPeak)
# ENCODE_H3K27me3 = rtracklayer::import("https://www.encodeproject.org/files/ENCFF322IFF/@@download/ENCFF322IFF.bed.gz", format = "BED", extraCols = extraCols_narrowPeak) - has ~1/2 the peaksdatadir = "/rds/general/user/la420/home/CUTnTAG/antibodies"
SEACR_noDup_inPeakData = c()
for(sample in sampleList){
peakRes = read.table(paste0(datadir, "/SEACR/", sample, "_seacr_top0.01.peaks.stringent.bed"), header = FALSE, fill = TRUE)
peak.gr = GenomicRanges::GRanges(seqnames = peakRes$V1, IRanges(start = peakRes$V2, end = peakRes$V3), strand = "*")
bamFile = paste0(datadir, "/bam/", sample, "_bowtie2_rmDup.mapped.bam")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
SEACR_noDup_inPeakN = counts(fragment_counts)[,1] %>% sum
SEACR_noDup_inPeakData = rbind(SEACR_noDup_inPeakData, data.frame(Antibody = sample, SEACR_noDup_inPeakN = SEACR_noDup_inPeakN))
}MACS2_inPeakData = c()
for(sample in sampleList){
peakRes = read.table(paste0(datadir, "/MACS2/", sample, "_q0.05_peaks.narrowPeak"), header = FALSE, fill = TRUE)
peak.gr = GenomicRanges::GRanges(seqnames = peakRes$V1, IRanges(start = peakRes$V2, end = peakRes$V3), strand = "*")
bamFile = paste0(datadir, "/bam/", sample, "_bowtie2_rmDup.mapped.bam")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
MACS2_noDup_inPeakN = counts(fragment_counts)[,1] %>% sum
MACS2_noDup_inPeakData = rbind(MACS2_noDup_inPeakData, data.frame(Antibody = sample, MACS2_noDup_inPeakN = MACS2_noDup_inPeakN))
}ENCODE_noDup_inPeakData = c()
peakRes = data.frame(ENCODE_H3K27ac)
for(sample in sampleList){
peak.gr = GenomicRanges::GRanges(seqnames = peakRes$seqnames, IRanges(start = peakRes$start, end = peakRes$end), strand = "*")
bamFile = paste0(datadir, "/bam/", sample, "_bowtie2_rmDup.mapped.bam")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
ENCODE_noDup_inPeakN = counts(fragment_counts)[,1] %>% sum
ENCODE_noDup_inPeakData = rbind(ENCODE_noDup_inPeakData, data.frame(Antibody = sample, ENCODE_noDup_inPeakN = ENCODE_noDup_inPeakN))
}datadir = "/rds/general/user/la420/home/CUTnTAG/antibodies_withDup"
SEACR_withDup_inPeakData = c()
for(sample in sampleList){
peakRes = read.table(paste0(datadir, "/SEACR/", sample, "_withDup_seacr_top0.01.peaks.stringent.bed"), header = FALSE, fill = TRUE)
peak.gr = GenomicRanges::GRanges(seqnames = peakRes$V1, IRanges(start = peakRes$V2, end = peakRes$V3), strand = "*")
bamFile = paste0(datadir, "/bam/mapped_bam/", sample, "_withDup_bowtie2.mapped.bam")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
SEACR_withDup_inPeakN = counts(fragment_counts)[,1] %>% sum
SEACR_withDup_inPeakData = rbind(SEACR_withDup_inPeakData, data.frame(Antibody = sample, SEACR_withDup_inPeakN = SEACR_withDup_inPeakN))
}MACS2_withDup_inPeakData = c()
for(sample in sampleList){
peakRes = read.table(paste0(datadir, "/MACS2/", sample, "_withDup_q0.05_peaks.narrowPeak"), header = FALSE, fill = TRUE)
peak.gr = GenomicRanges::GRanges(seqnames = peakRes$V1, IRanges(start = peakRes$V2, end = peakRes$V3), strand = "*")
bamFile = paste0(datadir, "/bam/mapped_bam/", sample, "_withDup_bowtie2.mapped.bam")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
MACS2_withDup_inPeakN = counts(fragment_counts)[,1] %>% sum
MACS2_withDup_inPeakData = rbind(MACS2_withDup_inPeakData, data.frame(Antibody = sample, MACS2_withDup_inPeakN = MACS2_withDup_inPeakN))
}ENCODE_withDup_inPeakData = c()
for(sample in sampleList){
bamFile = paste0(datadir, "/bam/mapped_bam/" sample, "_withDup_bowtie2.mapped.bam")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
ENCODE_withDup_inPeakN = counts(fragment_counts)[,1] %>% sum
ENCODE_withDup_inPeakData = rbind(ENCODE_withDup_inPeakData, data.frame(Antibody = sample, ENCODE_withDup_inPeakN = ENCODE_withDup_inPeakN))
}FRiP_data = alignDupSummary[c("Antibody", "MappedFragNum_hg19", "UniqueFragNum")] %>% left_join(SEACR_noDup_inPeakData, by = "Antibody") %>% left_join(SEACR_withDup_inPeakData, by = "Antibody") %>% left_join(MACS2_noDup_inPeakData, by = "Antibody") %>% left_join(MACS2_withDup_inPeakData, by = "Antibody") %>% left_join(ENCODE_noDup_inPeakData, by = "Antibody") %>% left_join(ENCODE_withDup_inPeakData, by = "Antibody")
FRiP_data$SEACR_noDup_FRiP = FRiP_data$SEACR_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$SEACR_withDup_FRiP = FRiP_data$SEACR_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100
FRiP_data$MACS2_noDup_FRiP = FRiP_data$MACS2_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$MACS2_withDup_FRiP = FRiP_data$MACS2_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100
FRiP_data$ENCODE_noDup_FRiP = FRiP_data$ENCODE_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$ENCODE_withDup_FRiP = FRiP_data$ENCODE_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100
FRiP_dataFRiP_summary = FRiP_data[c("Antibody", "SEACR_noDup_FRiP", "SEACR_withDup_FRiP", "MACS2_noDup_FRiP", "MACS2_withDup_FRiP", "ENCODE_noDup_FRiP", "ENCODE_withDup_FRiP")]
FRiP_summary_melt = melt(FRiP_summary, id.vars = c("Antibody"))
FRiP_plot = ggplot(FRiP_summary_melt, aes(Antibody, value)) +
geom_bar(aes(fill = variable),
width = 0.7, position = position_dodge(width = 0.75), stat = "identity") +
ylim(0, 100) +
ylab("% of fragments in peaks") +
xlab("Antibody") +
ggtitle("Fragments in peaks") +
theme_bw()
FRiP_plothg19_blacklist = ChIPseeker::readPeakFile("/rds/general/user/la420/home/hg19/other/hg19-blacklist.v2.bed")
overlapping_big_peaks_SEACR = Reduce(IRanges::subsetByOverlaps, GenomicRanges::GRangesList(ActiveMotif_SEACR_big_peaks, Diagenode_100x_SEACR_big_peaks, Diagenode_50x_SEACR_big_peaks, Abcam_ab4729_SEACR_big_peaks, Abcam_ab177178_SEACR_big_peaks))
overlapping_big_peaks_SEACR## GRanges object with 4 ranges and 3 metadata columns:
## seqnames ranges strand | V4 V5
## <Rle> <IRanges> <Rle> | <integer> <integer>
## 2121 chr4 49093382-49113260 * | 158078 33
## 2124 chr4 49145035-49153402 * | 74858 42
## 2181 chr5 10002-11835 * | 5060479 3751
## 3024 chrX 155259552-155260465 * | 389928 1394
## V6
## <factor>
## 2121 chr4:49104570-49104576
## 2124 chr4:49151464-49151465
## 2181 chr5:11185-11186
## 3024 chrX:155260181-155260182
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
blacklist_overlaps_SEACR = IRanges::subsetByOverlaps(overlapping_big_peaks_SEACR, hg19_blacklist)
blacklist_overlaps_SEACR## GRanges object with 4 ranges and 3 metadata columns:
## seqnames ranges strand | V4 V5
## <Rle> <IRanges> <Rle> | <integer> <integer>
## 2121 chr4 49093382-49113260 * | 158078 33
## 2124 chr4 49145035-49153402 * | 74858 42
## 2181 chr5 10002-11835 * | 5060479 3751
## 3024 chrX 155259552-155260465 * | 389928 1394
## V6
## <factor>
## 2121 chr4:49104570-49104576
## 2124 chr4:49151464-49151465
## 2181 chr5:11185-11186
## 3024 chrX:155260181-155260182
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
overlapping_big_peaks_MACS2 = Reduce(IRanges::subsetByOverlaps, GenomicRanges::GRangesList(ActiveMotif_MACS2_big_peaks, Diagenode_100x_MACS2_big_peaks, Diagenode_50x_MACS2_big_peaks, Abcam_ab4729_MACS2_big_peaks, Abcam_ab177178_MACS2_big_peaks))
overlapping_big_peaks_MACS2## GRanges object with 7 ranges and 7 metadata columns:
## seqnames ranges strand | V4
## <Rle> <IRanges> <Rle> | <factor>
## 158 chr1 121484085-121485428 * | Di-Alexi_S2_q0.05_peak_158
## 290 chr1 249239905-249240618 * | Di-Alexi_S2_q0.05_peak_290
## 526 chr12 95186-95735 * | Di-Alexi_S2_q0.05_peak_526
## 618 chr12 133841523-133841891 * | Di-Alexi_S2_q0.05_peak_618
## 1817 chr5 10033-11765 * | Di-Alexi_S2_q0.05_peak_1821
## 2539 chrX 155259865-155260404 * | Di-Alexi_S2_q0.05_peak_2594
## 2565 chrY 59362877-59363410 * | Di-Alexi_S2_q0.05_peak_2620
## V5 V6 V7 V8 V9 V10
## <integer> <factor> <numeric> <numeric> <numeric> <integer>
## 158 1604 . 18.91184 166.38576 160.49411 1065
## 290 2444 . 25.5522 250.50439 244.47372 353
## 526 3763 . 32.68985 382.44391 376.35733 313
## 618 15264 . 36.66592 1532.84814 1526.46899 200
## 1817 26993 . 13.2136 2708.82251 2699.33105 1154
## 2539 15726 . 34.80879 1579.05383 1572.65308 317
## 2565 15713 . 34.7993 1577.76953 1571.36902 311
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
blacklist_overlaps_MACS2 = IRanges::subsetByOverlaps(overlapping_big_peaks_MACS2, hg19_blacklist)
blacklist_overlaps_MACS2## GRanges object with 7 ranges and 7 metadata columns:
## seqnames ranges strand | V4
## <Rle> <IRanges> <Rle> | <factor>
## 158 chr1 121484085-121485428 * | Di-Alexi_S2_q0.05_peak_158
## 290 chr1 249239905-249240618 * | Di-Alexi_S2_q0.05_peak_290
## 526 chr12 95186-95735 * | Di-Alexi_S2_q0.05_peak_526
## 618 chr12 133841523-133841891 * | Di-Alexi_S2_q0.05_peak_618
## 1817 chr5 10033-11765 * | Di-Alexi_S2_q0.05_peak_1821
## 2539 chrX 155259865-155260404 * | Di-Alexi_S2_q0.05_peak_2594
## 2565 chrY 59362877-59363410 * | Di-Alexi_S2_q0.05_peak_2620
## V5 V6 V7 V8 V9 V10
## <integer> <factor> <numeric> <numeric> <numeric> <integer>
## 158 1604 . 18.91184 166.38576 160.49411 1065
## 290 2444 . 25.5522 250.50439 244.47372 353
## 526 3763 . 32.68985 382.44391 376.35733 313
## 618 15264 . 36.66592 1532.84814 1526.46899 200
## 1817 26993 . 13.2136 2708.82251 2699.33105 1154
## 2539 15726 . 34.80879 1579.05383 1572.65308 317
## 2565 15713 . 34.7993 1577.76953 1571.36902 311
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
sampleList_with_ENCODE = c(sampleList_all, "ENCODE_H3K27ac_narrowPeaks", "ENCODE_H3K27me3_narrowPeaks")
peakList_with_ENCODE = c(peakList, ENCODE_H3K27ac, ENCODE_H3K27me3) %>% setNames(sampleList_with_ENCODE)
peakList_with_ENCODE_no_blacklist = list()
for(i in 1:length(peakList_with_ENCODE)){
de_blacklisted = IRanges::subsetByOverlaps(peakList_with_ENCODE[[i]], hg19_blacklist, invert = TRUE)
peakList_with_ENCODE_no_blacklist = c(peakList_with_ENCODE_no_blacklist, de_blacklisted)
}
peakList_with_ENCODE_no_blacklist = setNames(peakList_with_ENCODE_no_blacklist, sampleList_with_ENCODE)blacklisted_peaks_count = lapply(peakList_with_ENCODE, function(x){length(IRanges::subsetByOverlaps(x, hg19_blacklist))}) %>% unlist() %>% unname()
Total_peaks_ENCODE = data.frame(Antibody = c("ENCODE_H3K27ac", "ENCODE_H3K27me"), Total_PeakN = c(length(ENCODE_H3K27ac), length(ENCODE_H3K27me3)))
Total_peaks_with_ENCODE = rbind(Total_CT_peaks, Total_peaks_ENCODE)
blacklisted = data.frame(Sample = sampleList_with_ENCODE, Total_peaks = Total_peaks_with_ENCODE$Total_PeakN, Blacklisted_peaks = blacklisted_peaks_count)
blacklisted$Proportion = blacklisted$Blacklisted_peaks / blacklisted$Total_peaks * 100
blacklistedblacklist_plot = ggplot(blacklisted) +
geom_col(aes(x = Sample, y = Proportion, fill = Sample), show.legend = FALSE) +
ylim(0, 20) +
ylab("Blacklisted peaks") +
xlab("Sample") +
ggtitle("Proportion of C&T peaks in blacklist") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
blacklist_plotChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$ActiveMotif_SEACR,
weightCol = "V4",
title = "CUT&TAG ActiveMotif (1:100) H3K27ac peaks (SEACR)")
ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$ActiveMotif_MACS2,
weightCol = "V5",
title = "CUT&TAG ActiveMotif (1:100) H3K27ac peaks (MACS2)")ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$Diagenode_100x_SEACR,
weightCol = "V4",
title = "CUT&TAG Diagenode (1:100) H3K27ac peaks (SEACR)")
ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$Diagenode_100x_MACS2,
weightCol = "V5",
title = "CUT&TAG Diagenode (1:100) H3K27ac peaks (MACS2)")ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$Diagenode_50x_SEACR,
weightCol = "V4",
title = "CUT&TAG Diagenode (1:50) H3K27ac peaks (SEACR)")
ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$Diagenode_50x_MACS2,
weightCol = "V5",
title = "CUT&TAG Diagenode (1:50) H3K27ac peaks (MACS2)")ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$`Abcam-ab4729_SEACR`,
weightCol = "V4",
title = "CUT&TAG Abcam-ab4729 (1:100) H3K27ac peaks (SEACR)")
ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$`Abcam-ab4729_MACS2`,
weightCol = "V5",
title = "CUT&TAG Abcam-ab4729 (1:100) H3K27ac peaks (MACS2)")ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$`Abcam-ab177178_SEACR`,
weightCol = "V4",
title = "CUT&TAG Abcam-ab177178 (1:100) H3K27ac peaks (SEACR)")
ChIPseeker::covplot(peak = peakList_with_ENCODE_no_blacklist$`Abcam-ab177178_SEACR`,
weightCol = "V5",
title = "CUT&TAG Abcam-ab177178 (1:100) H3K27ac peaks (MACS2)")ENCODE_H3K27ac_overlapping_CT_peaks = c()
ENCODE_H3K27me3_overlapping_CT_peaks = c()
for(i in 1:length(peakList)){
overlaps_H3K27ac = GenomicRanges::countOverlaps(query = peakList[[i]], subject = ENCODE_H3K27ac)
overlaps_H3K27me3 = GenomicRanges::countOverlaps(query = peakList[[i]], subject = ENCODE_H3K27me3)
CT_in_ENCODE_H3K27ac = overlaps_H3K27ac[overlaps_H3K27ac != 0] %>% length()
CT_in_ENCODE_H3K27me3 = overlaps_H3K27me3[overlaps_H3K27me3 != 0] %>% length()
ENCODE_H3K27ac_overlapping_CT_peaks = c(ENCODE_H3K27ac_overlapping_CT_peaks, CT_in_ENCODE_H3K27ac)
ENCODE_H3K27me3_overlapping_CT_peaks = c(ENCODE_H3K27me3_overlapping_CT_peaks, CT_in_ENCODE_H3K27me3)
}
CT_in_ENCODE = data.frame(Antibody = sampleList_all, CT_peaks_in_ENCODE_H3K27ac = ENCODE_H3K27ac_overlapping_CT_peaks, CT_peaks_in_ENCODE_H3K27me3 = ENCODE_H3K27me3_overlapping_CT_peaks)
CT_in_ENCODECollect C&T peaks in ENCODE:
CT_peaks_in_ENCODE_H3K27ac_set = c()
CT_peaks_in_ENCODE_H3K27me3_set = c()
for(i in 1:length(peakList)){
in_ENCODE_H3K27ac = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27ac)
in_ENCODE_H3K27me3 = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27me3)
CT_peaks_in_ENCODE_H3K27ac_set = c(CT_peaks_in_ENCODE_H3K27ac_set, in_ENCODE_H3K27ac)
CT_peaks_in_ENCODE_H3K27me3_set = c(CT_peaks_in_ENCODE_H3K27me3_set, in_ENCODE_H3K27me3)
}
CT_peaks_in_ENCODE_H3K27ac_set = setNames(CT_peaks_in_ENCODE_H3K27ac_set, sampleList_all)
CT_peaks_in_ENCODE_H3K27me3_set = setNames(CT_peaks_in_ENCODE_H3K27me3_set, sampleList_all)Collect C&T peaks not in ENCODE:
CT_peaks_not_in_ENCODE_H3K27ac_set = c()
CT_peaks_not_in_ENCODE_H3K27me3_set = c()
for(i in 1:length(peakList)){
not_in_ENCODE_H3K27ac = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27ac, invert = TRUE)
not_in_ENCODE_H3K27me3 = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27me3, invert = TRUE)
CT_peaks_not_in_ENCODE_H3K27ac_set = c(CT_peaks_not_in_ENCODE_H3K27ac_set, not_in_ENCODE_H3K27ac)
CT_peaks_not_in_ENCODE_H3K27me3_set = c(CT_peaks_not_in_ENCODE_H3K27me3_set, not_in_ENCODE_H3K27me3)
}
CT_peaks_not_in_ENCODE_H3K27ac_set = setNames(CT_peaks_not_in_ENCODE_H3K27ac_set, sampleList_all)
CT_peaks_not_in_ENCODE_H3K27me3_set = setNames(CT_peaks_not_in_ENCODE_H3K27me3_set, sampleList_all)captured_H3K27ac_ENCODE_list = c()
captured_H3K27me3_ENCODE_list = c()
for(i in 1:length(peakList)){
overlaps_H3K27ac = GenomicRanges::countOverlaps(query = ENCODE_H3K27ac, subject = peakList[[i]])
overlaps_H3K27me3 = GenomicRanges::countOverlaps(query = ENCODE_H3K27me3, subject = peakList[[i]])
ENCODE_H3K27ac_captured_peaks = overlaps_H3K27ac[overlaps_H3K27ac != 0] %>% length()
ENCODE_H3K27me3_captured_peaks = overlaps_H3K27me3[overlaps_H3K27me3 != 0] %>% length()
captured_H3K27ac_ENCODE_list = c(captured_H3K27ac_ENCODE_list, ENCODE_H3K27ac_captured_peaks)
captured_H3K27me3_ENCODE_list = c(captured_H3K27me3_ENCODE_list, ENCODE_H3K27me3_captured_peaks)
}
Total_captured_ENCODE_peaks = data.frame(Antibody = sampleList_all, ENCODE_H3K27ac_captured_peaks = captured_H3K27ac_ENCODE_list, ENCODE_H3K27me3_captured_peaks = captured_H3K27me3_ENCODE_list)
Total_captured_ENCODE_peaksCollect ENCODE peaks in C&T:
ENCODE_H3K27ac_peaks_in_CT_set = c()
ENCODE_H3K27me3_peaks_in_CT_set = c()
for(i in 1:length(peakList)){
in_CT_H3K27ac = IRanges::subsetByOverlaps(ENCODE_H3K27ac, peakList[[i]])
in_CT_H3K27me3 = IRanges::subsetByOverlaps(ENCODE_H3K27me3, peakList[[i]])
ENCODE_H3K27ac_peaks_in_CT_set = c(ENCODE_H3K27ac_peaks_in_CT_set, in_CT_H3K27ac)
ENCODE_H3K27me3_peaks_in_CT_set = c(ENCODE_H3K27me3_peaks_in_CT_set, in_CT_H3K27me3)
}
ENCODE_H3K27ac_peaks_in_CT_set = setNames(ENCODE_H3K27ac_peaks_in_CT_set, sampleList_all)
ENCODE_H3K27me3_peaks_in_CT_set = setNames(ENCODE_H3K27me3_peaks_in_CT_set, sampleList_all)Collect ENCODE peaks not in C&T:
ENCODE_H3K27ac_peaks_not_in_CT_set = c()
ENCODE_H3K27me3_peaks_not_in_CT_set = c()
for(i in 1:length(peakList)){
not_in_CT_H3K27ac = IRanges::subsetByOverlaps(ENCODE_H3K27ac, peakList[[i]], invert = TRUE)
not_in_CT_H3K27me3 = IRanges::subsetByOverlaps(ENCODE_H3K27me3, peakList[[i]], invert = TRUE)
ENCODE_H3K27ac_peaks_not_in_CT_set = c(ENCODE_H3K27ac_peaks_not_in_CT_set, not_in_CT_H3K27ac)
ENCODE_H3K27me3_peaks_not_in_CT_set = c(ENCODE_H3K27me3_peaks_not_in_CT_set, not_in_CT_H3K27me3)
}
ENCODE_H3K27ac_peaks_not_in_CT_set = setNames(ENCODE_H3K27ac_peaks_not_in_CT_set, sampleList_all)
ENCODE_H3K27me3_peaks_not_in_CT_set = setNames(ENCODE_H3K27me3_peaks_not_in_CT_set, sampleList_all)ENCODE_overlaps = Total_captured_ENCODE_peaks %>% left_join(Total_CT_peaks, by = "Antibody") %>% left_join(CT_in_ENCODE, by = "Antibody")
ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27ac = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27ac / ENCODE_overlaps$Total_PeakN * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27ac = ENCODE_overlaps$ENCODE_H3K27ac_captured_peaks / length(ENCODE_H3K27ac) * 100
ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27me3 = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27me3 / ENCODE_overlaps$Total_PeakN * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27me3 = ENCODE_overlaps$ENCODE_H3K27me3_captured_peaks / length(ENCODE_H3K27me3) * 100
ENCODE_overlaps_H3K27ac = ENCODE_overlaps[c("Antibody", "Total_PeakN", "CT_peaks_in_ENCODE_H3K27ac", "Percentage_of_CT_in_ENCODE_H3K27ac", "ENCODE_H3K27ac_captured_peaks", "Percentage_of_total_ENCODE_H3K27ac")]
ENCODE_overlaps_H3K27acENCODE_overlaps_H3K27me3 = ENCODE_overlaps[c("Antibody", "Total_PeakN", "CT_peaks_in_ENCODE_H3K27me3", "Percentage_of_CT_in_ENCODE_H3K27me3", "ENCODE_H3K27me3_captured_peaks", "Percentage_of_total_ENCODE_H3K27me3")]
ENCODE_overlaps_H3K27me3ENCODE_overlaps$Antibody = factor(ENCODE_overlaps$Antibody, levels = ENCODE_overlaps$Antibody)
Peaks_in_ENCODE_plot = ggplot(ENCODE_overlaps) +
geom_col(aes(x = Antibody, y = Percentage_of_CT_in_ENCODE_H3K27ac, fill = Antibody), show.legend = FALSE) +
ylim(0, 100) +
ylab("% of sample peaks falling into ENCODE") +
xlab("Antibody") +
ggtitle("Proportion of C&T peaks falling into ENCODE (H3K27ac)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Peaks_in_ENCODE_plotPeaks_in_ENCODE_plot = ggplot(ENCODE_overlaps) +
geom_col(aes(x = Antibody, y = Percentage_of_CT_in_ENCODE_H3K27me3, fill = Antibody), show.legend = FALSE) +
ylim(0, 100) +
ylab("% of sample peaks falling into ENCODE") +
xlab("Antibody") +
ggtitle("Proportion of C&T peaks falling into ENCODE (H3K27me3)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Peaks_in_ENCODE_plotPeaks_in_CT_plot = ggplot(ENCODE_overlaps) +
geom_col(aes(x = Antibody, y = Percentage_of_total_ENCODE_H3K27ac, fill = Antibody), show.legend = FALSE) +
ylim(0, 100) +
ylab("% of ENCODE peaks captured by sample") +
xlab("Antibody") +
ggtitle("Proportion of ENCODE peaks captured by C&T (H3K27ac)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Peaks_in_CT_plotPeaks_in_CT_plot = ggplot(ENCODE_overlaps) +
geom_col(aes(x = Antibody, y = Percentage_of_total_ENCODE_H3K27me3, fill = Antibody), show.legend = FALSE) +
ylim(0, 100) +
ylab("% of ENCODE peaks captured by sample") +
xlab("Antibody") +
ggtitle("Proportion of ENCODE peaks captured by C&T (H3K27me3)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Peaks_in_CT_plotpeakFiles_list = list.files(path = "/rds/general/user/la420/home/CUTnTAG/antibodies/all_peaks", pattern = "\\.bed$", all.files = FALSE, full.names = TRUE, recursive = FALSE, ignore.case = FALSE, include.dirs = FALSE)
ENCODE_H3K27ac_path = file.path("/rds/general/user/la420/home/CUTnTAG/antibodies/all_peaks/ENCODE/H3K27ac_ENCFF044JNJ.bed.narrowPeak")
ENCODE_H3K27me3_path = file.path("/rds/general/user/la420/home/CUTnTAG/antibodies/all_peaks/ENCODE/H3K27me3_ENCFF126QYP.bed.narrowPeak")
txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
testOverlap_H3K27ac = function(qPeak){
ChIPseeker::enrichPeakOverlap(queryPeak = qPeak,
targetPeak = ENCODE_H3K27ac_path,
TxDb = txdb,
pAdjustMethod = "BH",
nShuffle = 500,
chainFile = NULL,
verbose = F)
}
testOverlap_H3K27me3 = function(qPeak){
ChIPseeker::enrichPeakOverlap(queryPeak = qPeak,
targetPeak = ENCODE_H3K27me3_path,
TxDb = txdb,
pAdjustMethod = "BH",
nShuffle = 500,
chainFile = NULL,
verbose = F)
}promoters = ChIPseeker::getPromoters(TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, upstream = 3000, downstream = 3000)
tagMatrixList = lapply(peakList, getTagMatrix, windows = promoters)
plot = ChIPseeker::plotAvgProf(tagMatrixList, xlim = c(-3000, 3000), conf = 0.95, resample = 500, facet = "row")## >> Running bootstrapping for tag matrix... 2021-05-16 20:08:38
## >> Running bootstrapping for tag matrix... 2021-05-16 20:08:46
## >> Running bootstrapping for tag matrix... 2021-05-16 20:09:04
## >> Running bootstrapping for tag matrix... 2021-05-16 20:09:22
## >> Running bootstrapping for tag matrix... 2021-05-16 20:09:49
## >> Running bootstrapping for tag matrix... 2021-05-16 20:10:19
## >> Running bootstrapping for tag matrix... 2021-05-16 20:10:49
## >> Running bootstrapping for tag matrix... 2021-05-16 20:11:06
## >> Running bootstrapping for tag matrix... 2021-05-16 20:11:37
## >> Running bootstrapping for tag matrix... 2021-05-16 20:12:12
## >> Running bootstrapping for tag matrix... 2021-05-16 20:12:23
## >> Running bootstrapping for tag matrix... 2021-05-16 20:12:32
ENCODE_names = c("ENCODE_H3K27ac_narrowPeaks", "ENCODE_H3K27me3_narrowPeaks")
ENCODE_peakList = list(ENCODE_H3K27ac, ENCODE_H3K27me3) %>% setNames(ENCODE_names)
tagMatrixList_ENCODE = lapply(ENCODE_peakList, getTagMatrix, windows = promoters)
plot = ChIPseeker::plotAvgProf(tagMatrixList_ENCODE, xlim = c(-3000, 3000), conf = 0.95, resample = 500, facet = "row")## >> Running bootstrapping for tag matrix... 2021-05-16 20:13:43
## >> Running bootstrapping for tag matrix... 2021-05-16 20:14:04
peakdir = "/rds/general/user/la420/home/CUTnTAG/antibodies_withDup/all_peaks/"
sampleList_all_withDup = c("ActiveMotif_SEACR_withDup", "ActiveMotif_MACS2_withDup", "Diagenode_100x_SEACR_withDup", "Diagenode_100x_MACS2_withDup", "Diagenode_50x_SEACR_withDup", "Diagenode_50x_MACS2_withDup", "Abcam-ab177178_SEACR_withDup", "Abcam-ab177178_MACS2_withDup", "Abcam-ab4729_SEACR_withDup", "Abcam-ab4729_MACS2_withDup", "H3K27me3_SEACR_withDup", "H3K27me3_MACS2_withDup")
peakList_withDup = list()
for(sample in sampleList_all_withDup){
path = paste0(peakdir, sample, ".bed")
peak = ChIPseeker::readPeakFile(file.path(path), as = "GRanges") %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
peakList_withDup = c(peakList_withDup, peak)
}
peakList_withDup = setNames(peakList_withDup, sampleList_all_withDup)Chromatin State Segmentation by HMM: http://genome.ucsc.edu/cgi-bin/hgFileUi?g=wgEncodeBroadHmm&db=hg19
chrHMM_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/wgEncodeBroadHmmK562HMM.bed.gz"
chrHMM = genomation::readBed(chrHMM_url)
chrHMM_list = GenomicRanges::split(chrHMM, chrHMM$name, drop = TRUE)annotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(peakList_with_ENCODE), chrHMM_list)
genomation::heatTargetAnnotation(annotation)get_stats = function(anno){genomation::getTargetAnnotationStats(anno, percentage = TRUE, precedence = FALSE)}
anno_set = lapply(annotation, get_stats) %>% bind_rows() %>% t() %>% as.matrix()
corr_anno_set = cor(anno_set, method = "pearson")
colnames(corr_anno_set) = sampleList_with_ENCODE
rownames(corr_anno_set) = sampleList_with_ENCODE
corr_anno_set = corr_anno_set %>% round(3)
col = colorRampPalette(brewer.pal(8, "PuOr"))(200)
corrplot(corr_anno_set, method = "color", col = col, type = "upper", order = "original", tl.col = "black", tl.srt = 50, tl.cex = 0.5, addCoef.col = "black", number.cex = 0.5)annotation_dup_only = genomation::annotateWithFeatures(GenomicRanges::GRangesList(duplicate_only_peaks), chrHMM_list)
genomation::heatTargetAnnotation(annotation_dup_only)annotation_in_ENCODE_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(CT_peaks_in_ENCODE_H3K27ac_set), chrHMM_list)
genomation::heatTargetAnnotation(annotation_in_ENCODE_H3K27ac)annotation_not_in_ENCODE_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(CT_peaks_not_in_ENCODE_H3K27ac_set), chrHMM_list)
genomation::heatTargetAnnotation(annotation_not_in_ENCODE_H3K27ac)genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
names(genes) = sub("_", "\n", names(genes))
compKEGG = clusterProfiler::compareCluster(geneCluster = genes,
fun = "enrichKEGG",
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
clusterProfiler::dotplot(compKEGG, showCategory = 15, title = "KEGG Pathway Enrichment Analysis")motifdir = "/rds/general/user/la420/home/CUTnTAG/antibodies/motifs/"
sampleList_all_motifs = c("ActiveMotif_SEACR", "ActiveMotif_MACS2", "Diagenode_100x_SEACR", "Diagenode_100x_MACS2", "Diagenode_50x_SEACR", "Diagenode_50x_MACS2", "Abcam-ab177178_SEACR", "Abcam-ab177178_MACS2", "Abcam-ab4729_SEACR", "Abcam-ab4729_MACS2", "H3K27me3_SEACR", "H3K27me3_MACS2", "ENCODE_H3K27ac", "ENCODE_H3K27me3", "ENCODE_H3K27me3_2")
motifs = list()
for(i in 1:length(sampleList_all_motifs)){
motifs[[i]] = read.csv(paste0(motifdir, sampleList_all_motifs[i], "/knownResults.txt"), sep = "\t", header = TRUE) %>% data.frame()
}motif_list = motifs[[1]][,1]
motif_summary = data.frame(Motif.Name = motif_list)
for(i in 1:length(motifs)){
values = motifs[[i]][,c(1,3)]
motif_summary = left_join(motif_summary, values, by = "Motif.Name")
}
colnames(motif_summary) = c("Motif", sampleList_all_motifs)motif_set = motif_summary[2:ncol(motif_summary)]
for(i in 1:length(motifs)){
motif_set[i] = format(motif_set[i], scientific = FALSE) %>% unlist %>% as.numeric
}
corr_motif_set = cor(motif_set, method = "pearson") %>% round(3)col = colorRampPalette(brewer.pal(8, "PuOr"))(200)
heatmap.2(x = corr_motif_set, col = col, trace = "none", density.info = "none", cexRow = 1, cexCol = 1, cellnote = corr_motif_set, notecex = 1, notecol = "black", margins = c(12, 12), lhei = c(1, 5), lwid = c(1, 5))